import pandas as pd
import numpy as np

# Load the EDW launch-price dataset.
df = pd.read_csv('edw_launch_price_data.csv')
df.head()

# Extract the numeric identifier embedded in each categorical label.
# Raw strings avoid the invalid-escape-sequence warning for '\d'.
df['micro_market_number'] = df['micro_market'].str.extract(r'(\d+)')
df['developer_number'] = df['developer'].str.extract(r'(\d+)')
df['project_number'] = df['project'].str.extract(r'(\d+)')

# Launch year and log-transformed weighted average price (wap).
df['launch_year'] = pd.DatetimeIndex(df['launch_date']).year
df['log_wap'] = np.log(df['wap'])
# Frequency of the raw bedroom tags (comma-separated, unordered, e.g. '3,1,2').
df['proj_bedrooms'].value_counts()
# old_tags holds the unsorted tag variants observed in the data; new_tags is
# the same set of digits sorted ascending, position by position.
new_tags=['1,2,3','1,2,3,4','1,2,3,4','1,3','2,3','2,4','1,3,4','1,2,3,4,5','2,3,4,5','3,4','1,2,3,5','2,3,4','2,3,4']
old_tags=['3,1,2','4,1,2,3','2,3,4,1','3,1','3,2','4,2','4,1,3','2,3,4,5,1','5,2,3,4','4,3','2,3,5,1','4,2,3','3,4,2']
df["proj_bedrooms_clean"]=df["proj_bedrooms"]
df["proj_bedrooms_clean"]=df["proj_bedrooms_clean"].replace(old_tags,new_tags)
# Keep only the smallest bedroom count of each (now sorted) tag ...
df["proj_bedrooms_clean"]=df["proj_bedrooms_clean"].str.split(',').str[0]
# ... and relabel it as an open-ended category such as '2 and more'.
df["proj_bedrooms_clean"]=df["proj_bedrooms_clean"].replace(['1','2','3','4','5','6'],['1 and more','2 and more','3 and more','4 and more','5 and more','6 and more'])
df["proj_bedrooms_clean"].value_counts()
# Keep only the modelling columns; fill missing values with the string "0".
# NOTE(review): this stringifies NaNs even in numeric columns such as b1..b8;
# the subsequent astype('int') relies on "0" being parseable as an integer.
cols=['zone', 'micro_market_number', 'developer_number','project_number','wap','log_wap','proj_bedrooms_clean','unit_type','proj_launched_units','launch_year','construction_status','max_size','min_size','b1','b2','b3','b4','b5','b6','b7','b8']
df1=pd.DataFrame(df[cols]).fillna("0")
df1.head()
df1.info()
def type_converter(dataframe, col_name, data_type):
    """Cast column *col_name* of *dataframe* to *data_type* in place.

    data_type is anything pandas' Series.astype accepts ('int', 'category', ...).
    """
    dataframe[col_name] = dataframe[col_name].astype(data_type)
# Cast the last 8 columns (the b1..b8 bedroom-count columns) to integers.
for i in list(df1.columns)[-8:]:
    type_converter(df1, i, 'int')
# Cast the categorical predictors to pandas' 'category' dtype.
for i in ['zone','micro_market_number','proj_bedrooms_clean','project_number','unit_type','launch_year','construction_status']:
    type_converter(df1, i, 'category')
df1.info()
# Bedroom-typology shares: bpK = bK / total units launched in the project.
for k in range(1, 9):
    df1['bp' + str(k)] = df1['b' + str(k)] / df1['proj_launched_units']
# Pooled count of units with 4 to 8 bedrooms (the individual columns are sparse).
df1['b48'] = df1['b4'] + df1['b5'] + df1['b6'] + df1['b7'] + df1['b8']
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render figures inline in the notebook.
%matplotlib inline
# Global seaborn theme for all subsequent plots.
sns.set(style="darkgrid")
sns.set_color_codes("dark")
# Mean weighted-average price (wap) per micromarket.
mm_prices=df1.groupby('micro_market_number')['wap'].mean()
mm_prices
The number of micromarkets is too large for them to be included directly as a predictor variable. Let's group/cluster micromarkets according to their mean prices.
mm_prices.shape
# Reshape the mean prices into an (n_micromarkets, 1) array — the 2-D
# feature layout scikit-learn estimators expect.
mm=mm_prices.values
mm=mm.reshape(-1, 1)
mm.shape
from sklearn.cluster import KMeans
# Cluster micromarkets on their mean wap. A fixed random_state makes the
# clustering reproducible across notebook runs; the former n_jobs argument
# was removed from KMeans in scikit-learn 0.25 and is no longer accepted.
kmeans = KMeans(n_clusters=20, n_init=100, max_iter=1000, random_state=42)
# Fit and return a cluster label for each micromarket.
kmm = kmeans.fit_predict(mm)
# Clustering quality metric, range [-1, 1]; higher is better.
from sklearn.metrics import silhouette_score
print("Silhouette Score of Micromarket Clusters:",silhouette_score(mm, kmm))
#cluster dataframe
kmm=pd.DataFrame(kmm)
kmm.columns=['mm_cluster']
#cluster number and number of projects for every micromarket
mm_proj=df1.groupby('micro_market_number')['project_number'].count().reset_index()
# Column-wise concat relies on kmm and mm_proj sharing the same 0..n-1
# positional index (both follow the sorted micromarket order).
mm_proj=pd.concat([mm_proj, kmm],axis=1)
mm_proj
# Dictionary mapping micromarket number with cluster numbers
market_cluster = dict(zip(mm_proj['micro_market_number'], mm_proj['mm_cluster']))
# Create a new column in the dataset indicating cluster numbers, initialised with micromarket numbers
df1['mm_cluster']=df1['micro_market_number']
# Map cluster numbers to their respective micromarket numbers
df1['mm_cluster']=df1['mm_cluster'].map(market_cluster)
# Row counts per cluster (cluster sizes in project-row terms).
df1['mm_cluster'].value_counts()
# How many projects are there for each cluster?
kmm_proj=df1.groupby('mm_cluster')['project_number'].count().reset_index()
# (reset_index already returns a DataFrame; this wrapper is a no-op.)
kmm_proj=pd.DataFrame(kmm_proj)
kmm_proj
# projects per cluster
sns.barplot(x="mm_cluster", y="project_number", data=kmm_proj)
Let's check stability of clusters by looking at how mean wap prices vary across clusters for every launch year.
We will compute Spearman's Rank Correlation to check for monotonic relations for the mean prices across clusters for all years.
# Mean wap per (launch_year, mm_cluster) pair.
year_mm_price=df1.groupby(['launch_year','mm_cluster'])['wap'].mean()
year_mm_price=pd.DataFrame(year_mm_price)
# One column of per-cluster mean prices for each launch year 2014-2018.
mean_wap_by_year=pd.concat([year_mm_price.loc[(2014)],year_mm_price.loc[(2015)],year_mm_price.loc[(2016)],year_mm_price.loc[(2017)],year_mm_price.loc[(2018)]],axis=1)
mean_wap_by_year.columns=['meanwap_2014','meanwap_2015','meanwap_2016','meanwap_2017','meanwap_2018']
mean_wap_by_year
from scipy.stats import spearmanr
# Pairwise Spearman rank correlations between the yearly mean-price columns.
rho, pval = spearmanr(mean_wap_by_year,nan_policy='omit') # rho - correlation coefficient, p-value - signifies monotonic relationship
rho = pd.DataFrame(rho)
rho.index=['meanwap_2014','meanwap_2015','meanwap_2016','meanwap_2017','meanwap_2018']
rho.columns=['meanwap_2014','meanwap_2015','meanwap_2016','meanwap_2017','meanwap_2018']
rho
Given the high value of Spearman's Rank Correlation coefficients, there's a strong monotonic relation between mean wap prices across clusters throughout various years. The clusters are indeed stable.
#Number of micromarkets in each cluster
kmm_micro=mm_proj.groupby('mm_cluster')['micro_market_number'].count().reset_index()
# (reset_index already returns a DataFrame; this wrapper is a no-op.)
kmm_micro=pd.DataFrame(kmm_micro)
kmm_micro
# Visual representation of number of micromarkets in each cluster
sns.barplot(x="mm_cluster", y="micro_market_number", data=kmm_micro)
Note that the initial centroids of the k-means algorithm are not the same every time the algorithm is run, so unless a fixed random seed is set, the micromarket frequency counts for each cluster will differ every time this notebook is run.
# wap against different clusters
f, ax = plt.subplots(figsize=(20,10))
sns.boxplot(x="mm_cluster", y="wap", data=df1)
Micromarket clustering explains the variation in weighted average prices well. Micromarket clusters are thus a good predictor for wap.
# wap vs micromarket cluster, one strip panel per zone.
sns.catplot(x="mm_cluster", y="wap", col="zone", col_wrap=2, data=df1, kind="strip", height=5, aspect=1.1, palette='Set2')
Strip plots show the frequency of each class of a categorical variable, in addition to indicating the central tendency and variance of a continuous target variable within each class.
# wap vs micromarket cluster, one strip panel per bedroom category.
sns.catplot(x="mm_cluster", y="wap", col="proj_bedrooms_clean",col_wrap=2, data=df1, kind="strip", height=5, aspect=1.4, palette='Set2')
Projects with greater number of bedrooms are pricier across all micromarkets. However, such projects have low frequency.
# wap vs micromarket cluster, one strip panel per launch year.
sns.catplot(x="mm_cluster", y="wap", col="launch_year", data=df1, col_wrap=2, kind="strip", height=5, aspect=1.2, palette='Set2')
Prices are not time invariant for the different micromarket clusters: they tend to decrease over time across all clusters.
# wap vs micromarket cluster, one strip panel per construction status.
sns.catplot(x="mm_cluster", y="wap", col="construction_status",col_wrap=2, data=df1, kind="strip", height=6, aspect=1, palette='Set2')
Clustering of micromarkets indeed explains price variations significantly. Significant variations of prices are visible across zones and different segments of other categorical variables as well. Since clustering/grouping of micromarkets was done based on mean prices, this shows that prices are also strongly influenced by other factors.
Let us begin by looking at frequency distributions of micromarket clusters and several categorical features. We begin with crosstab tables involving unit-type, project bedrooms and construction status.
# Frequency crosstabs (annotated counts) of categorical feature pairs.
sns.heatmap(pd.crosstab(df1['construction_status'],df1['unit_type']),annot=True, fmt='d',cmap='gist_gray')
sns.heatmap(pd.crosstab(df1['proj_bedrooms_clean'], df1['unit_type']),annot=True, fmt='d',cmap='gist_gray')
The number of apartments overshadows that of villas. Most of the projects are already under construction and consist of 1 and 2 bedrooms.
# Crosstab of bedroom category vs construction status (robust colour scaling).
sns.heatmap(pd.crosstab(df1['proj_bedrooms_clean'],df1['construction_status']),annot=True, fmt='d',cmap='gist_gray',robust=True)
Now, let us look at frequency distributions of micromarket clusters across unit-types, number of project bedrooms and construction status of projects.
# Micromarket-cluster frequency crosstabs against the main categoricals.
fig,ax=plt.subplots(figsize=(10,10))
fig=sns.heatmap(pd.crosstab(df1['mm_cluster'],df1['construction_status']),annot=True, fmt='d',cmap='Blues')
fig,ax=plt.subplots(figsize=(10,10))
fig=sns.heatmap(pd.crosstab(df1['mm_cluster'],df1['unit_type']),annot=True, fmt='d',cmap='copper')
fig,ax=plt.subplots(figsize=(10,10))
fig=sns.heatmap(pd.crosstab(df1['mm_cluster'],df1['proj_bedrooms_clean']),annot=True, fmt='d',cmap='copper')
Micromarket frequencies are not evenly distributed across all categories of unit-type, project bedrooms and construction.
# Cluster vs zone crosstab, then wap distribution per zone.
fig,ax=plt.subplots(figsize=(10,10))
fig=sns.heatmap(pd.crosstab(df1['mm_cluster'],df1['zone']),annot=True, fmt='d',cmap='copper')
f, ax = plt.subplots(figsize=(10,5))
sns.boxplot(x="zone", y="wap",data=df1)
As shown just now, wap varies significantly among different zones. Zone J, with the highest mean price, is the costliest area; Zone B is the cheapest.
# Mean wap per zone, one bar panel per bedroom category; error bars = std dev.
sns.catplot(x="zone", y="wap", col="proj_bedrooms_clean",col_wrap=2,data=df1, kind="bar", height=5, aspect=1.3, palette='Set3',ci='sd')
The barplots show mean prices while the error bar depicts variation in prices.
# Mean wap per zone, one bar panel per unit type.
sns.catplot(x="zone", y="wap", col="unit_type",col_wrap=2, data=df1, kind="bar", height=6, aspect=1.1, palette='Set2')
Villas have far fewer occurrences compared to apartments as most people are not wealthy enough to afford a villa. Villas do not occur in all zones.
# Mean wap per zone, one bar panel per construction status.
sns.catplot(x="zone", y="wap", col="construction_status",col_wrap=2,data=df1, kind="bar", height=6, aspect=1.1)
Prices vary with project bedroom numbers, unit types and construction status differently across different years.
# wap distribution per launch year.
f, ax = plt.subplots(figsize=(10,5))
sns.boxplot(x="launch_year", y="wap", data=df1,palette='Set1')
Prices are definitely not time invariant. Let's observe how they vary with time across different segments of categorical variables.
A point plot represents an estimate of central tendency for a numeric variable by the position of scatter plot points and provides some indication of the uncertainty around that estimate using error bars. We are using pointplots to analyse how prices vary across years for different categories of some predictor variables.
# wap over launch years, one point-plot panel per bedroom category.
sns.catplot(x="launch_year", y="wap", col="proj_bedrooms_clean",col_wrap=2, data=df1, kind="point", height=5, aspect=1, color='darkgreen')
Generally, the greater the number of bedrooms, the pricier an apartment is. Prices declined over time in spite of temporary spike.
# wap over launch years, one point-plot panel per construction status.
sns.catplot(x="launch_year", y="wap", col="construction_status",col_wrap=2, data=df1, kind="point", height=5, aspect=1, color='darkorange')
Projects that commenced earlier have definitely completed earlier. Very few projects are under pre-launch at the time of analysis.
# wap over launch years with one line per unit type.
sns.pointplot(x="launch_year", y="wap", hue="unit_type",data=df1, markers=["o",'*'], linestyles=["-",'--'])
Unit type definitely influences prices.
General trend is that prices decrease over time for all categories of a categorical feature. An exception is the pre-launch project prices. This is probably because speculated prices before a project commences are subjected to changes by various market factors.
# Univariate wap distributions by unit type, bedroom category and status.
sns.boxplot(x="unit_type", y="wap", data=df1, palette='Set1')
sns.boxplot(x="proj_bedrooms_clean", y="wap", data=df1, palette='Set1')
sns.boxplot(x="construction_status", y="wap", data=df1, palette='Set1')
Compared to project bedrooms and unit types, construction status seems to contribute the least variance of prices within the interquartile range across its categories.
# wap by bedroom category, one box panel per construction status.
sns.catplot(x="proj_bedrooms_clean", y="wap", col="construction_status",col_wrap=2, data=df1, kind="box", height=5, aspect=1, palette='Set3')
Flats under construction - those with 4 bedrooms are costliest. Completed flats- those with 3 bedrooms are costliest.
# wap by bedroom category, one box panel per unit type.
sns.catplot(x="proj_bedrooms_clean", y="wap", col="unit_type",col_wrap=2, data=df1, kind="box", height=5, aspect=1.1, palette='Set3')
Apartments- 4 bedrooms and more are costliest. Those with one bedrooms least expensive. Villa- Largely sparse. 2 bedrooms and more units are costlier. Those with one bedrooms least expensive.
# wap by unit type, one box panel per construction status.
sns.catplot(x="unit_type", y="wap", col="construction_status",col_wrap=2, data=df1, kind="box", height=5, aspect=1, palette='Set3')
Apartments show greater central tendency and variation in prices due to higher frequency counts and demand compared to villa whose individual unit is pricier as only economically elites can afford villas.
Now, let's look at how maxsize and minsize varies with wap across different segments of various categorical variables.
# Linear-fit scatter of wap against unit sizes, one panel per zone.
sns.lmplot(x="min_size", y="wap",col='zone',hue='zone',col_wrap=2, height=5, aspect=1, data=df1)
sns.lmplot(x="max_size", y="wap",col='zone',hue='zone',col_wrap=2, height=5, aspect=1, data=df1)
The correlations between prices and min_size, and between prices and max_size, vary across zones. Many of the predictor variables are strongly correlated with zone.
Strong multicollinearity between min_size and max_size suspected.
# wap vs max_size, one linear-fit panel per launch year.
sns.lmplot(x="max_size", y="wap",col='launch_year',hue='launch_year',col_wrap=2, height=5, aspect=1, data=df1)
Max_size varies across years. Its sharpest increase is recorded in 2016.
# wap vs max_size, one linear-fit panel per bedroom category.
sns.lmplot(x="max_size", y="wap",col='proj_bedrooms_clean',hue='proj_bedrooms_clean',col_wrap=2, height=5, aspect=1,data=df1)
Prices are invariant for all maximum sizes regardless of number of bedrooms.
# wap vs max_size, panelled by construction status and by unit type.
sns.lmplot(x="max_size", y="wap",col='construction_status',hue='construction_status',col_wrap=2, height=5,aspect=1, data=df1)
sns.lmplot(x="max_size", y="wap",col='unit_type',hue='unit_type',col_wrap=2, height=5, aspect=1,data=df1)
Unit type and construction status across maximum sizes of projects also influence prices.
Max_size not affected by proj_bedrooms_clean as it seems.
# Line plots involving prices against bedroom by typology
# One subplot per bpK share column (bpK = bK / proj_launched_units).
f, ax = plt.subplots(4,2 ,figsize=(20,10))
sns.lineplot(x="bp1", y='wap', data=df1, color="black",ax=ax[0,0])
sns.lineplot(x="bp2", y='wap', data=df1, color="yellow",ax=ax[0,1])
sns.lineplot(x="bp3", y='wap', data=df1, color="green",ax=ax[1,0])
sns.lineplot(x="bp4", y='wap', data=df1, color="maroon",ax=ax[1,1])
sns.lineplot(x="bp5", y='wap', data=df1, color="orange",ax=ax[2,0])
sns.lineplot(x="bp6", y='wap', data=df1, color="darkblue",ax=ax[2,1])
sns.lineplot(x="bp7", y='wap', data=df1, color="red",ax=ax[3,0])
sns.lineplot(x="bp8", y='wap', data=df1, color="indigo",ax=ax[3,1])
Except for bp1, increasing trend between prices and bedrooms by typology per project. This is because number of bedrooms in an unit influences price. Figures portray noisy data due to sparsity of columns.
# Split up wap into intervals
# Right-closed bins: (0, 5k], (5k, 10k], (10k, 30k], (30k, 90k].
df1['wap_interval'] = pd.cut(df1['wap'], bins=[0,5e+3,1e+4,3e+4,9e+4])
df1['wap_interval'].value_counts()
from scipy.stats import chi2_contingency
# Test independence of each categorical predictor against the binned wap.
print("-----P-Values of two-way Chi-Square Test involving categorical predictors-----")
for i in ['construction_status','unit_type','proj_bedrooms_clean','launch_year','mm_cluster','zone']:
    cont=pd.crosstab(df1['wap_interval'],df1[i]) #Contingency table
    g, p, dof, expctd = chi2_contingency(cont)
    print(i,": ", p)
Based on inferences from the clustermap below, certain pairs of variables are not advised to be used simultaneously in regression:
1) max_size and min_size
2) bp6 and bp7
3) bp5 and bp8
# Dendrogram linkage between continuous variables showing links based on correlation coefficients
include=['wap','max_size','min_size','bp1','bp2','bp3','bp4','bp5','bp6','bp7','bp8']
sns.clustermap(pd.DataFrame(df1[include]).corr(),annot=True, method='complete')# Complete linkage used for cluster analysis
It's desirable to have low correlations among independent variables.
Max_size and min_size should not be used simultaneously to avoid collinearity in regression model. Let's choose max_size instead first.
from sklearn.feature_selection import mutual_info_regression
# Mutual information of each continuous predictor with wap (higher = more
# dependence). Each column is reshaped to the (n, 1) layout sklearn expects.
print("-----P-Values of Mutual Information Regression Test involving continuous predictors-----")
for i in ['min_size','max_size','bp1','bp2','bp3','bp4','bp5','bp6','bp7','bp8']:
    com=df1[i].values
    com=com.reshape(-1,1)
    mi=mutual_info_regression(com, df1['wap'])
    print(i,": ", mi)
The mutual information score of two random variables is a measure of the mutual dependence between the two variables. The higher the value of that score, the higher the dependency.
More information on mutual information can be found here: http://www.scholarpedia.org/article/Mutual_information
The continuous variables to be chosen are max_size, bp1, bp2 and bp3.
Let's fit all the categorical features discussed so far and the 4 selected numerical features into a linear regression model. Try with log-transformed price as dependent variable first. Log-transformation is known to minimise skewness and ensure normality in residual distribution.
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Full OLS model: log(wap) on all candidate categorical + continuous predictors.
model = smf.ols(formula='log_wap ~ max_size + C(mm_cluster) + bp1 + bp2 + bp3 + C(launch_year) + C(unit_type) + C(proj_bedrooms_clean) + C(construction_status) + C(zone)', data=df1).fit()
print(model.summary())
Let's try cutting down number of categorical features and see if the adjusted R-squared values change or not. Construction status contributed least variance over prices. Drop it. Drop zone as well as clustered micromarkets are defined by zones as we have seen earlier.
# Reduced model: drop construction_status and zone from the full model.
mod1 = smf.ols(formula='log_wap ~ max_size + C(mm_cluster) + bp1 + bp2 + bp3 + C(launch_year) + C(unit_type) + C(proj_bedrooms_clean)', data=df1).fit()
print(mod1.summary())
We observe no deterioration in the adjusted R-squared value.
# Same predictors but with untransformed wap as the dependent variable.
mod = smf.ols(formula='wap ~ max_size + C(mm_cluster) + bp1 + bp2 + bp3 + C(launch_year) + C(unit_type) + C(proj_bedrooms_clean)', data=df1).fit()
print(mod.summary())
The standard errors are larger and the adjusted R-squared value is lower, suggesting greater degree of residuals.
def residual_log (model, dataframe, residual_log, residual):
    """Compute log-scale and absolute-scale residuals for *model* in place.

    model        -- fitted statsmodels result; predict() is assumed to return
                    fitted values of log(wap)
    dataframe    -- frame with 'log_wap' and 'wap' columns; gains two columns
    residual_log -- name for the new log-residual column
    residual     -- name for the new absolute-residual column
    Returns (summary stats of the log residuals, None); the second element is
    the return value of print(), used as a side effect to show the median.
    """
    predicted_lwap = model.predict()
    # Back-transform predicted log prices to absolute prices.
    predicted_wap=np.exp(predicted_lwap)
    dataframe[residual_log]=dataframe['log_wap'] - predicted_lwap
    dataframe[residual]=dataframe['wap'] - predicted_wap
    return (dataframe[residual_log].describe(), print('median: ',dataframe[residual_log].median()))
def percentage_error_projects (dataframe, residual, percentage_error, percentage_error_interval):
    """Bucket per-project percentage errors and report project shares per bucket.

    dataframe -- frame with 'wap', 'project_number' and the *residual* column;
                 gains the *percentage_error* and *percentage_error_interval* columns
    Returns a frame with project counts and percentage of projects per interval.
    """
    # Percentage error of the residual relative to the observed wap.
    dataframe[percentage_error]=(dataframe[residual]/dataframe['wap'])*100
    # Split percentage error into fixed intervals, open-ended at both extremes.
    dataframe[percentage_error_interval] = pd.cut(dataframe[percentage_error], bins=[-float("inf"),-30,-20,-10,0,10,20,30,float("inf")])
    # How many projects in each interval of wap?
    grouped_df = dataframe.groupby(percentage_error_interval)['project_number'].count().reset_index()
    # Percentage of projects under each interval
    grouped_df['percentage_projects']=(grouped_df['project_number']/grouped_df['project_number'].sum())*100
    return (grouped_df)
residual_log (mod1, df1, 'residual_log','residual') # Residual statistics for log-transformed proposed base model
pred_lwap = mod1.predict() #Predicted values of log-transformed wap
pred_wap=np.exp(pred_lwap) #Predicted log-transformed wap values transformed into absolute values
#Include values for predicted values of wap and log-wap derived from base model with log-transformation into dataframe
pred_lwap=pd.DataFrame(pred_lwap)
pred_wap=pd.DataFrame(pred_wap)
pred_lwap.columns=['pred_lwap']
pred_wap.columns=['pred_wap']
# Positional concat: predictions and df1 share the same 0..n-1 row index.
df1=pd.concat([df1,pred_lwap,pred_wap],axis=1)
df1.head()
percentage_error_projects (df1, 'residual', 'percentage_error','percentage_error_interval')
#Mean Absolute Percentage Error of log-transformed proposed base model
np.absolute(df1['percentage_error']).mean()
# Lets investigate normal distribution properties of log-transformed residual and absolute residuals of base model
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; histplot/displot
# is the modern replacement — confirm the installed seaborn version.
f, ax = plt.subplots(1,2,figsize=(5,5))
sns.distplot(df1['residual'],color="y", ax=ax[0])
sns.distplot(df1['residual_log'],color="y",ax=ax[1])
print("Skewness of residual: %f" % df1['residual'].skew())
print("Kurtosis of residual: %f" % df1['residual'].kurt())
print("Skewness of residual_log: %f" % df1['residual_log'].skew())
print("Kurtosis of residual_log: %f" % df1['residual_log'].kurt())
Log-transformation reduces skewness and kurtosis of error distribution.
Let's look at the residual distribution of the first proposed base model with all the categorical features.
residual_log (model, df1, 'residual_log_model','residual_model') # Residual statistics for the first proposed base model
percentage_error_projects (df1, 'residual_model', 'percentage_error_model','percentage_error_interval_model')
#Mean Absolute Percentage Error of first proposed base model
np.absolute(df1['percentage_error_model']).mean()
# Residual histograms per wap interval.
sns.FacetGrid(df1, col="wap_interval").map(plt.hist, "residual_model")
Smaller instances of prediction errors at higher prices.
import scipy.stats as stats
fig,ax=plt.subplots(figsize=(10,10))
#Cook's Distance Plot for the untransformed model 'mod'
fig=sm.graphics.influence_plot(mod, ax=ax, color='b')
#QQ Plot for absolute residual of log-transformed base model showing deviation from normal distribution
fig = sm.qqplot(df1['residual'], fit=True, line='45')
plt.show()
Cook's distance identifies outliers and leverage points in observations that shows their influence on regression model. QQ normal plot of a quantity signifies how well it fits a normal distribution.
fig,ax=plt.subplots(figsize=(10,10))
#Cook's Distance Plot for base model
fig=sm.graphics.influence_plot(mod1, ax=ax, color='b')
#QQ Plot for log-residual of base model showing deviation from normal distribution
fig = sm.qqplot(df1['residual_log'], fit=True, line='45')
plt.show()
Log-transformation of a skewed quantity will definitely aid in greater degree of normalisation. The residuals of the model with log-transformation shows greater resemblance with a normal distribution.
fig,ax=plt.subplots(figsize=(10,10))
#Cook's Distance Plot for first proposed base model
fig=sm.graphics.influence_plot(model, ax=ax, color='b')
#QQ Plot for log-residual of first proposed base model
fig = sm.qqplot(df1['residual_log_model'], fit=True, line='45')
plt.show()
Therefore, the model 'mod1' will be the base model due to greater degree of normality of its residuals as indicated by qqplot, skewness and kurtosis values in distribution table.
# Residual plots involving continuous variables in base model — a pattern
# here would indicate non-linearity left unexplained by the model.
f, ax = plt.subplots(2,2 ,figsize=(25,10))
sns.residplot(x="max_size", y='residual_log', data=df1, color="r",ax=ax[0,0])
sns.residplot(x="bp1", y='residual_log', data=df1, color="black",ax=ax[0,1])
sns.residplot(x="bp2", y='residual_log', data=df1, color="y",ax=ax[1,0])
sns.residplot(x="bp3", y='residual_log', data=df1, color="green",ax=ax[1,1])
# Columns retained for cross-validation of the base model.
include=['log_wap','max_size','mm_cluster', 'bp1','bp2','bp3', 'unit_type','proj_bedrooms_clean','launch_year']
df1_cv=pd.DataFrame(df1[include])
type_converter (df1_cv, 'mm_cluster', 'category')
df1_cv.head()
from sklearn.preprocessing import OneHotEncoder
# Create an instance of the one-hot encoding class. sparse_output=False
# requests a dense array (the former `sparse` alias was removed in
# scikit-learn 1.4).
onehot_encoder = OneHotEncoder(sparse_output=False)
colmn=['mm_cluster','unit_type','proj_bedrooms_clean','launch_year']
onehot_encoded = onehot_encoder.fit_transform(df1_cv[colmn])
#Create and concatenate new features formed from one-hot encoding of categorical features
onehot_encoded=pd.DataFrame(onehot_encoded)
df1_cv=pd.concat([df1_cv,onehot_encoded],axis=1)
df1_cv=df1_cv.drop(colmn,axis=1)
df1_cv.head()
df1_cv.info()
from sklearn.model_selection import train_test_split
target = 'log_wap'
# Boolean mask selecting every column except the target.
features = df1_cv.columns != target
x = df1_cv.loc[:,features]
y = df1_cv[target]
# Default 75/25 split; fixed random_state for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=8)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
# Fit on the training split, then 8-fold cross-validate on it.
reg = LinearRegression().fit(x_train,y_train)
# cross validate
scores_reg = cross_validate(reg, x_train, y_train, cv=8, return_train_score=True,return_estimator=False)
scores_reg =pd.DataFrame(scores_reg)
scores_reg
print("Linear Regression Base Model Train Score:",scores_reg ['train_score'].mean())
print("Linear Regression Base Model Test Score:",scores_reg ['test_score'].mean())
# Amenity columns: a non-null entry means the amenity was recorded for the
# project; missing entries are filled with the literal string "No".
amenities=df.loc[:,'am_water':'am_water_supply'] # yes-counts of individual amenities
amenities.count().sort_values(ascending=False)
amenities=amenities.fillna("No")
amenities.head()
def label_encoder(dataframe, col_name): # Function for label encoding categorical variables
    """Label-encode column *col_name* of *dataframe* in place (0..k-1 codes)."""
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(dataframe[col_name].unique())
    dataframe[col_name] = le.transform(dataframe[col_name])
# Label-encode every amenities column.
for i in list(amenities.columns):
    label_encoder(amenities, i)
# Group related amenity indicators into thematic clusters by summing their
# 0/1 encodings; the sums are binarised to presence flags in the next cell.
amenities['am_water']=amenities['am_water']+amenities['am_water_cons']+amenities['am_water_supply']+amenities['am_rain_harvest']
amenities['am_security']=amenities['am_security']+amenities['am_security_guards']+amenities['am_intercom']+amenities['am_cctv']
amenities['am_outdoor_game']=amenities['am_badmintorn']+amenities['am_basketball']+amenities['am_cric_pitch']+amenities['am_golf']+amenities['am_sports']+amenities['am_tennis']+amenities['am_swim']+amenities['am_jog']
amenities['am_indoor_game']=amenities['am_aerobics']+amenities['am_gym']+amenities['am_skating']+amenities['am_indoor_games']+amenities['am_squash']+amenities['am_spa']
amenities['am_sewage']=amenities['am_swer_trt_plt']+amenities['am_swr_chamber']+amenities['am_sold_waste']+amenities['am_storm_drain']+amenities['am_swr_trt_slug']
amenities['am_roads']=amenities['am_roads']+amenities['am_road_footpath']+amenities['am_streetlight']
amenities['am_cyber']=amenities['am_internet']
amenities['am_power']=amenities['am_energy_mgmt']+amenities['am_elec_meter_room']+amenities['am_ht_alarm']+amenities['am_transformer']+amenities['am_earthquake']+amenities['am_power_backup']
amenities['am_religion']=amenities['am_temple']+amenities['am_vaastu']
amenities['am_space']=amenities['am_ls_garden']+amenities['am_ls_tree']+amenities['am_child_play']+amenities['am_open_space']+amenities['am_amphi']+amenities['am_rec_facility']
amenities['am_finance']=amenities['am_atm']+amenities['am_business_center']
amenities['am_firefighting']=amenities['am_ff_system']+amenities['am_ff_req']
amenities['am_health_related']=amenities['am_health']+amenities['am_hosp']
amenities['am_car_parking']=amenities['am_open_carpark']+amenities['am_open_parking']+amenities['am_closed_carpark']+amenities['am_carpark']
amenities['am_education']=amenities['am_lib']+amenities['am_oth']+amenities['am_school']
amenities['am_maintenance']=amenities['am_maint_staff']+amenities['am_gated']+amenities['am_staffq']
amenities['am_social_gatherings']=amenities['am_commu_building']+amenities['am_commu_hall']+amenities['am_banquet']+amenities['am_party_hall']+amenities['am_club']+amenities['am_multi_room']+amenities['am_cafeteria']
amenities['am_lift']=amenities['am_lift']+amenities['am_service_lift']
amenities['am_shop']=amenities['am_shopmall']+amenities['am_utility_shop']
# Using OR logic, collapse the summed indicator counts to binary presence
# flags: any count of 2..7 (several amenities of the group present) becomes 1.
for i in ['am_water','am_security','am_outdoor_game','am_indoor_game','am_sewage','am_roads','am_cyber','am_power','am_religion','am_space','am_finance','am_firefighting','am_health_related','am_car_parking','am_education','am_maintenance','am_social_gatherings','am_lift','am_shop']:
    amenities[i]=amenities[i].replace([2,3,4,5,6,7],[1,1,1,1,1,1])
# Convert amenities cluster features from integer to categorical
for i in list(amenities.columns):
    type_converter (amenities, i, 'category')
# Append the amenities features to the modelling frame.
df1=pd.concat([df1,amenities],axis=1)
df1.head()
# Specification columns; missing entries filled with the literal string 'No'.
specifications=df.loc[:,'sp_balcony':'sp_cement'] # Yes-counts of specifications
specifications.count().sort_values(ascending=False)
specifications=specifications.fillna('No')
specifications.head()
specifications.info()
# True where the cell contains 'No' (i.e. the specification is absent).
for i in list(specifications.columns):
    specifications[i]=specifications[i].str.contains('No')
# Label-encode the booleans, then flip 0/1 so that 1 means "present".
for i in list(specifications.columns):
    label_encoder(specifications, i)
    specifications[i] = specifications[i].replace([0,1],[1,0])
# Convert the specification flags to categorical dtype.
for i in list(specifications.columns):
    type_converter (specifications, i, 'category')
# Append the specification flags to the modelling frame.
df1=pd.concat([df1,specifications],axis=1)
df1.head()
Let's add all the amenities clusters and specifications features to base model and evaluate performance.
# OLS: base model predictors plus all amenity clusters and specification flags.
mod2 = smf.ols(formula='log_wap ~ max_size + C(mm_cluster) + bp1 + bp2 + bp3 + C(launch_year) + C(proj_bedrooms_clean) + C(unit_type) + C(am_water)+ C(am_security)+ C(am_indoor_game) + C(am_outdoor_game)+ C(am_sewage) + C(am_roads) + C(am_cyber) + C(am_power) + C(am_religion) + C(am_space) + C(am_finance) + C(am_firefighting) + C(am_health_related) + C(am_car_parking) + C(am_education) + C(am_maintenance) + C(am_social_gatherings) + C(am_lift) + C(am_shop) + C(sp_balcony) + C(sp_kitchen) + C(sp_livdin) + C(sp_mas_bed) + C(sp_oth_bed) + C(sp_toilets) + C(sp_internal) + C(sp_main) + C(sp_interior) + C(sp_switches) + C(sp_windows) + C(sp_wiring) + C(sp_exterior) + C(sp_frame) + C(sp_points) + C(sp_lobby) + C(sp_cement)', data=df1).fit()
print(mod2.summary())
residual_log (mod2, df1, 'residual_log_mod2', 'residual_mod2')
percentage_error_projects (df1, 'residual_mod2','percentage_error_mod2', 'percentage_error_interval_mod2')
# Mean Absolute Percentage Error for model including specifications and amenities
np.absolute(df1['percentage_error_mod2']).mean()
fig,ax=plt.subplots(figsize=(10,10))
#Cook's Distance Plot for model including specifications & amenities
fig=sm.graphics.influence_plot(mod2, ax=ax, color='b')
#QQ Plot for absolute residual of model including specifications & amenities
fig = sm.qqplot(df1['residual_log_mod2'], fit=True, line='45')
plt.show()
# Residual histograms per wap interval; shared y-limit for comparability.
g = sns.FacetGrid(df1, col="wap_interval")
g = g.map(plt.hist, "residual_mod2").set(ylim=(0, 600))
Next, we run a two-way chi-square test to relate the categorical features to price and residual levels.
# Placeholder column; overwritten with string labels below.
df1['wap_level']=0
# Chained assignment: residual_abs gets |residual| and residual_abs_level
# starts as a copy of it before being relabelled below.
df1['residual_abs_level']=df1['residual_abs']=np.absolute(df1['residual'])
df1.loc[df1['wap'] > 20000, 'wap_level']= "High" #Splitting wap into labelled categories
df1.loc[(df1['wap'] >= 4000) & (df1['wap'] <= 20000),'wap_level']= "Medium"
df1.loc[df1['wap'] < 4000, 'wap_level']= "Low"
#Splitting absolute residual values of base model into categories
df1.loc[df1['residual_abs'] > 2500, 'residual_abs_level']= "High"
df1.loc[(df1['residual_abs'] >= 300) & (df1['residual_abs'] <= 2500),'residual_abs_level']= "Medium"
df1.loc[(df1['residual_abs'] < 300), 'residual_abs_level']= "Low"
Most projects fall under medium category for both prices and residual level.
# Distribution of the categorical levels just created.
df1['residual_abs_level'].value_counts()
df1['wap_level'].value_counts()
Chi-square test involving amenities clusters and absolute residual level.
print("-----Chi-Square Two-Way P-Values of amenities clusters vs residual of base model-----")
# For each amenity cluster, test independence between the cluster flag and
# the base model's residual level; a low p-value suggests association.
for i in ['am_water','am_security','am_outdoor_game','am_indoor_game','am_sewage','am_roads','am_cyber','am_power','am_religion','am_space','am_finance','am_firefighting','am_health_related','am_car_parking','am_education','am_maintenance','am_social_gatherings','am_lift','am_shop']:
cont=pd.crosstab(df1['residual_abs_level'],df1[i]) #Contingency table
g, p, dof, expctd = chi2_contingency(cont)
print(i,": ", p)
We chose the top 6 clusters with the lowest p-values: am_cyber, am_education, am_finance, am_health_related, am_indoor_game, am_security.
Now, do a similar chi-square test for specifications.
print("-----Chi-Square Two-Way P-Values of specifications vs residual of base model-----")
# Same independence test, now for every specification feature.
for i in list(specifications.columns):
cont=pd.crosstab(df1['residual_abs_level'],df1[i]) # Contingency table
g, p, dof, expctd = chi2_contingency(cont) # Chi-square test of independence
print(i,": ", p)
At 1% significance level, the following specifications are dependent on base model residuals:
sp_interior, sp_main, sp_exterior, sp_points, sp_cement
Let's reduce the model to the base features plus only the selected amenities and specifications identified above.
# Refit using only the amenities/specifications that the chi-square tests
# flagged as associated with the base model residuals.
selected = smf.ols(formula='log_wap ~ max_size + C(mm_cluster) + C(proj_bedrooms_clean) + bp1 + bp2 + bp3 + C(launch_year) + C(unit_type) + C(am_cyber) + C(am_education) + C(am_finance) + C(am_security) + C(am_health_related) + C(am_indoor_game) + C(sp_interior) + C(sp_main) + C(sp_exterior) + C(sp_points) + C(sp_cement)', data=df1).fit()
print(selected.summary())
# BUG FIX: residuals were previously computed from `mod2`, not the newly
# fitted `selected` model, so every downstream "_selected" metric actually
# described mod2. Pass `selected` here.
residual_log (selected, df1, 'residual_log_selected', 'residual_selected')
percentage_error_projects (df1, 'residual_selected','percentage_error_selected', 'percentage_error_interval_selected')
# Mean Absolute Percentage Error for the model with selected amenities and specifications
np.absolute(df1['percentage_error_selected']).mean()
sns.FacetGrid(df1, col="wap_interval").map(plt.hist, "residual_selected")
It's good that clusters of micromarkets are able to explain price variations.
# Percentage error vs price, coloured by zone / price level / residual level.
sns.relplot(x="wap", y="percentage_error_selected", hue="zone", data=df1)
sns.relplot(x="wap", y="percentage_error_selected", hue="wap_level", data=df1)
sns.relplot(x="wap", y="percentage_error_selected", hue="residual_abs_level", data=df1)
Large errors in prediction occur in rare instances. Same goes for low errors at high prices.
# Percentage-error spread within each micro-market cluster.
f, ax = plt.subplots(figsize=(10,5))
sns.boxplot(x="mm_cluster", y="percentage_error_selected", data=df1)
Let's observe how wap and absolute residual values of base model varies across different categories for different amenities and specifications. 0 means no amenities are present in an amenity cluster while 1 implies at least one amenity is present in that cluster.
# Box plots of wap and base-model residuals for each selected amenity
# cluster, faceted by wap/residual level (0 = no amenity present in the
# cluster, 1 = at least one amenity present).
g = sns.catplot(x="am_cyber", y="wap", col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_cyber", y="residual", col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_education", y="wap", col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_education", y="residual", col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_finance", y="wap", col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_finance", y="residual", col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_health_related", y="wap", col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_health_related", y="residual", col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_security", y="wap", col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_security", y="residual", col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_indoor_game", y="wap", col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="am_indoor_game", y="residual", col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
Weighted average price and price residuals vary the most for amenities falling under high-level category of residuals.
Amenities with low p-values as shown during chi-square test are showing significant variations in residuals and weighted average prices across different categories.
Weighted average prices are influenced by certain amenities more than others. The presence and absence of specifications also makes a difference to prices.
# Same comparison for the selected specification features: wap and
# base-model residuals by presence/absence of each specification.
g = sns.catplot(x="sp_interior", y="wap", col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="sp_interior", y="residual", col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="sp_main", y="wap", col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="sp_main", y="residual", col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="sp_exterior", y="wap",col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="sp_exterior", y="residual",col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="sp_points", y="wap", col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="sp_points", y="residual", col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="sp_cement", y="wap", col="wap_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
g = sns.catplot(x="sp_cement", y="residual", col="residual_abs_level", data=df1, kind="box", height=5, aspect=.7,palette='Set3')
Conclusion: Same as that for amenities. Certain specifications impact project prices more than others as seen from chi-square tests. The presence and absence of specifications also makes a difference to prices.
How do model performances vary for high and low prices?
# .copy() so the downstream column assignments (residual_log /
# percentage_error_projects write new columns into these frames) operate on
# independent DataFrames instead of slices of df1 — avoids pandas'
# SettingWithCopyWarning without changing any values.
low_cutoff = df1.loc[df1['wap'] <= 10000, :].copy() # Low-Cutoff prices: wap<=10000
high_cutoff = df1.loc[df1['wap'] > 10000, :].copy() # High-Cutoff prices: wap>10000
# Fit the selected-features specification separately on the high-price subset.
hc = smf.ols(formula='log_wap ~ max_size + C(mm_cluster) + bp1 + bp2 + bp3 + C(launch_year) + C(proj_bedrooms_clean) + C(unit_type) + C(am_cyber) + C(am_education) + C(am_finance) + C(am_security) + C(am_health_related) + C(am_indoor_game) + C(sp_interior) + C(sp_main) + C(sp_exterior) + C(sp_points) + C(sp_cement)', data=high_cutoff).fit()
print(hc.summary())
residual_log (hc, high_cutoff, 'residual_log_hc', 'residual_hc')
percentage_error_projects (high_cutoff, 'residual_hc','percentage_error_hc', 'percentage_error_interval_hc')
# Mean Absolute Percentage Error for high cut off price model residuals
np.absolute(high_cutoff['percentage_error_hc']).mean()
# Same model on the low-price subset, for comparison.
lc = smf.ols(formula='log_wap ~ max_size + C(mm_cluster) + C(proj_bedrooms_clean) + bp1 + bp2 + bp3 + C(launch_year)+ C(unit_type) + C(am_cyber) + C(am_education) + C(am_finance) + C(am_security) + C(am_health_related) + C(am_indoor_game) + C(sp_interior) + C(sp_main) + C(sp_exterior) + C(sp_points) + C(sp_cement)', data=low_cutoff).fit()
print(lc.summary())
residual_log (lc, low_cutoff, 'residual_log_lc', 'residual_lc') #Residual summary statistics for low cutoff price model
percentage_error_projects (low_cutoff, 'residual_lc','percentage_error_lc', 'percentage_error_interval_lc')
# Mean Absolute Percentage Error of low cutoff price model residuals
np.absolute(low_cutoff['percentage_error_lc']).mean()
The model for low cutoff prices is better in predicting weighted average prices. This shows that low priced projects can be better predicted with lesser errors than higher priced projects.
# Build the machine-learning frame: selected features only, label-encoded
# and cast to int so tree-based models can consume them.
include=['log_wap','max_size','mm_cluster', 'bp1','bp2','bp3', 'unit_type','proj_bedrooms_clean','launch_year', 'am_security', 'am_indoor_game', 'am_cyber', 'am_finance', 'am_health_related','am_education', 'sp_interior', 'sp_main', 'sp_exterior', 'sp_points','sp_cement']
df2=pd.DataFrame(df1[include])
for i in ['unit_type','proj_bedrooms_clean','launch_year', 'am_security', 'am_indoor_game', 'am_cyber', 'am_finance', 'am_health_related', 'am_education', 'sp_interior', 'sp_main', 'sp_exterior', 'sp_points', 'sp_cement']:
label_encoder(df2, i)
type_converter (df2, i, 'int')
df2.head()
from sklearn.model_selection import train_test_split
target = 'log_wap'
features = df2.columns != target  # boolean mask: every column except the target
x = df2.loc[:,features]
y = df2[target]
# random_state pins the shuffle so the split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=8)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): criterion='mae' was renamed to 'absolute_error' in
# scikit-learn 1.0 and the old alias was later removed — confirm the
# installed scikit-learn version still accepts 'mae'.
rf = RandomForestRegressor(n_estimators=2000, n_jobs=5, max_depth=12,random_state=4,criterion='mae')
rf.fit(x_train, y_train)
def model_R2 (model,x_testdataframe,y_testdataframe):
    """Print and return the R^2 score of `model` on a held-out test set.

    Parameters
    ----------
    model : fitted estimator exposing `.predict`
    x_testdataframe : test-set features
    y_testdataframe : test-set target values

    Returns
    -------
    float
        The R^2 score. (Previously this returned the result of `print()`,
        i.e. always None; returning the score is backward compatible since
        existing callers ignore the return value.)
    """
    from sklearn.metrics import r2_score
    y_pred = model.predict(x_testdataframe)
    score = r2_score(y_testdataframe, y_pred)
    print("R2 Score -", score)
    return score
# R^2 of the random forest on the held-out test set.
print("Random Forest Regressor:")
model_R2 (rf, x_test, y_test)
def plot_feature_importances (model, kind, title, color, dataframe):
    """Draw a sorted feature-importance plot for a fitted tree ensemble.

    model : fitted estimator exposing `feature_importances_`.
    kind, color : forwarded to pandas' `.plot` (e.g. 'barh', 'maroon').
    title : chart title.
    dataframe : frame whose column names label the importances.
    """
    ranked = pd.Series(data=model.feature_importances_, index=dataframe.columns)
    # Ascending sort puts the most important feature at the top of a barh chart.
    ranked = ranked.sort_values()
    ranked.plot(kind=kind, color=color)
    plt.title(title)
    return plt.show()
plot_feature_importances (rf, 'barh', 'Random Forest Feature Importances', 'maroon', x_train)
def compute_residual (model, column, dataframe,x_test_df,y_test_df):
    """Back-transform log predictions to price scale, store the residuals
    (actual wap - predicted wap) in `dataframe[column]`, and return their
    summary statistics.

    Rows of `dataframe` absent from `y_test_df`'s index receive NaN via
    pandas index alignment.
    """
    predicted_wap = np.exp(model.predict(x_test_df))
    actual_wap = np.exp(y_test_df)
    dataframe[column] = pd.DataFrame(actual_wap - predicted_wap)
    return dataframe[column].describe()
# Store random-forest residuals (price scale) on df1; only test-set rows
# receive values, the rest align to NaN.
compute_residual (rf, 'residual_rf', df1, x_test, y_test)
percentage_error_projects (df1, 'residual_rf', 'percentage_error_rf', 'percentage_error_interval_rf')
# Mean Absolute Percentage Error for Random Forest Regressor
print('MAPE of Random Forest Regressor:', np.absolute(df1['percentage_error_rf']).mean())
sns.FacetGrid(df1, col="wap_interval",margin_titles=True).map(plt.hist, "residual_rf")
sns.relplot(x="wap", y="percentage_error_rf", hue="zone", data=df1)
sns.relplot(x="wap", y="percentage_error_rf", hue="residual_abs_level", data=df1)
sns.relplot(x="wap", y="percentage_error_rf", hue="wap_level", data=df1)
Errors are concentrated at smaller levels regardless of the categories they belong to. Due to the relatively higher level of residuals, the scatter plots for each category are widely spread out compared to the linear regression plots.
# Random-forest percentage error per micro-market cluster.
f, ax = plt.subplots(figsize=(10,5))
sns.boxplot(x="mm_cluster", y="percentage_error_rf", data=df1)
The Random Forest regressor model is performing even more poorly, indicating greater degree of residuals. Percentage residual is starting to show greater variance across clusters.
from xgboost import XGBRegressor
# 'reg:linear' was deprecated and renamed to 'reg:squarederror' — the same
# squared-error objective under its current name, which silences the
# deprecation warning on modern XGBoost.
xg_reg=XGBRegressor(objective='reg:squarederror', n_estimators=170, learning_rate=0.1, seed=100,base_score=0.1,reg_alpha=0.4,reg_lambda=0.3)
xg_reg.fit(x_train, y_train)
from sklearn.model_selection import cross_validate
# cross validate (8-fold; a regressor's default scoring is R^2)
scores_xgreg = cross_validate(xg_reg, x_train, y_train, cv=8, return_train_score=True,return_estimator=False)
scores_xgreg =pd.DataFrame(scores_xgreg )
scores_xgreg
print('Mean Test Score for XG Boost Regressor: ', scores_xgreg ['test_score'].mean())
print('Mean Train Score for XG Boost Regressor: ', scores_xgreg ['train_score'].mean())
plot_feature_importances (xg_reg, 'barh', 'XG Boost Feature Importances', 'purple', x_train)
Just like random forest, micromarket cluster remains the favourite feature while max_size and bedroom by typologies continue to top the list as well. Feature map tells us different importance of predictor variables being considered.
# Test-set R^2, residuals and error plots for the XGBoost model.
print("XGBoost Regressor:")
model_R2 (xg_reg, x_test, y_test)
compute_residual (xg_reg, 'residual_xgreg', df1, x_test, y_test)
percentage_error_projects (df1, 'residual_xgreg', 'percentage_error_xgreg', 'percentage_error_interval_xgreg')
# Mean Absolute Percentage Error for Extreme Gradient Boost Regressor
print('MAPE of Gradient Boost Regressor:', np.absolute(df1['percentage_error_xgreg']).mean())
sns.FacetGrid(df1, col="wap_interval",margin_titles=True).map(plt.hist, "residual_xgreg")
sns.relplot(x="wap", y="percentage_error_xgreg", hue="zone", data=df1)
sns.relplot(x="wap", y="percentage_error_xgreg", hue="wap_level", data=df1)
sns.relplot(x="wap", y="percentage_error_xgreg", hue="residual_abs_level", data=df1)
Compared to Random Forest, errors are lower overall for XGBoost Regressor. However, it is unable to beat multiple linear regression model.
# XGBoost percentage error per micro-market cluster.
f, ax = plt.subplots(figsize=(10,5))
sns.boxplot(x="mm_cluster", y="percentage_error_xgreg", data=df1)
# Full feature set: sizes, typology shares/counts, categoricals, all amenity
# clusters and all specifications.
# NOTE(review): 'bp8' is listed here but only bp1..bp7 are visible being
# created upstream — confirm bp8 exists in df1 before this cell runs.
include=['log_wap','max_size','min_size','mm_cluster','bp1','bp2','bp3','bp4','bp5','bp6','bp7','bp8','b1','b2','b3','b4','b5','b6','b7','b8','unit_type','construction_status','proj_bedrooms_clean','launch_year']
amen=['am_water','am_security','am_indoor_game','am_outdoor_game','am_sewage','am_roads','am_cyber','am_power','am_religion','am_space','am_finance','am_firefighting','am_health_related','am_car_parking','am_education','am_maintenance','am_social_gatherings','am_lift','am_shop']
df3=pd.concat([df1[include],df1[amen],specifications],axis=1)
# Label-encode and cast every column after the first 20 (amenities + specs).
for i in list(df3.columns)[20:]:
label_encoder(df3, i)
type_converter (df3, i, 'int')
df3.head()
dependent = 'log_wap'
independent = df3.columns != dependent  # boolean mask: all columns except the target
x1 = df3.loc[:,independent]
y1 = df3[dependent]
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, random_state=8) # Split into train and test dataset
# All-features XGBoost model. 'reg:linear' was deprecated and renamed to
# 'reg:squarederror' (identical squared-error objective); use the current name.
xg_reg_all=XGBRegressor(objective='reg:squarederror', n_estimators=200, learning_rate=0.095, seed=100,base_score=0,reg_alpha=0.3)
xg_reg_all.fit(x1_train, y1_train) # Fit Extreme Gradient Boost Regressor into training data
# cross validate (8-fold R^2, train and test scores)
scores_xgreg_all = cross_validate(xg_reg_all, x1_train, y1_train, cv=8, return_train_score=True,return_estimator=False)
scores_xgreg_all = pd.DataFrame(scores_xgreg_all)
scores_xgreg_all
print('Mean Test Score for XG Boost Regressor using all features: ', scores_xgreg_all ['test_score'].mean())
print('Mean Train Score for XG Boost Regressor using all features: ', scores_xgreg_all ['train_score'].mean())
# Test-set R^2 and feature importances for the all-features model.
print("XGBoost Regressor for all features:")
model_R2 (xg_reg_all, x1_test, y1_test)
fig,ax=plt.subplots(figsize=(10,12))
fig=plot_feature_importances (xg_reg_all, 'barh', 'XG Boost Feature Importances using all features', 'purple', x1_train)
Not all features are important to the XG Boost Regressor model. Most of the features in amenities and specifications were not even considered.
# Residuals and error metrics for the all-features XGBoost model, stored on df1.
compute_residual (xg_reg_all, 'residual_xgreg_all', df1, x1_test, y1_test)
# Error Distribution for Extreme Gradient Boost Regressor using all features
percentage_error_projects (df1, 'residual_xgreg_all', 'percentage_error_xgreg_all', 'percentage_error_interval_xgreg_all')
# Mean Absolute Percentage Error for Extreme Gradient Boost Regressor using all features
print('MAPE of XGBoost Regressor using all features:', np.absolute(df1['percentage_error_xgreg_all']).mean())
Errors are slightly higher when all the features are fed to the XG Boost Regressor model.
# Error distribution and scatter plots for the all-features model.
sns.FacetGrid(df1, col="wap_interval",margin_titles=True).map(plt.hist, "residual_xgreg_all")
sns.relplot(x="wap", y="percentage_error_xgreg_all", hue="zone", data=df1)
sns.relplot(x="wap", y="percentage_error_xgreg_all", hue="wap_level", data=df1)
sns.relplot(x="wap", y="percentage_error_xgreg_all", hue="residual_abs_level", data=df1)
Errors are slightly larger than for the XG Boost model with selected amenities and specifications. Thus, it is important to be selective and feed only the features that are good predictors.
# All-features model: percentage error per micro-market cluster.
f, ax = plt.subplots(figsize=(10,5))
sns.boxplot(x="mm_cluster", y="percentage_error_xgreg_all", data=df1)